The actual concrete compressive strength (MPa) for a given mixture under a specific age (days) was determined from laboratory. Data is in raw form (not scaled). The data has 8 quantitative input variables, and 1 quantitative output variable, and 1030 instances (observations).
Cement manufacturing
Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. These ingredients include cement, blast furnace slag, fly ash, water, superplasticizer, coarse aggregate, and fine aggregate.
Modelling of strength of high performance concrete using Machine Learning
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# various scaling algorithms
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from scipy.stats import zscore
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split # to split the dataset for training and testing
from sklearn.model_selection import cross_val_score # for performing cross validation of models
from sklearn.svm import SVC # Support vector machine model
from sklearn.utils import shuffle
from sklearn import metrics # to get various evaluation metrics
from sklearn.metrics import roc_auc_score # receiver operating curve score
from sklearn.metrics import accuracy_score # accuracy of prediction score
from sklearn.metrics import recall_score # recall score
from sklearn.metrics import precision_score # precision score
from sklearn.metrics import f1_score # f1 score
from sklearn.decomposition import PCA # performing principal components analysis
# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score
# Load the concrete-mixture dataset (1030 rows, 8 predictors + strength target).
# read the dataset to a dataframe
cm_df = pd.read_csv('concrete.csv')
# taking a look at the first 10 rows of the dataframe, already seeing some missing values
# NOTE(review): the isna() check later reports no nulls — the "missing values"
# seen here are likely legitimate zeros (ingredient not used), not NaNs.
cm_df.head(10)
cm_df.shape # dimensions of the dataframe
This is a regression problem: the dataset provides 8 quantitative predictor variables and one continuous target variable (concrete compressive strength).
# To check for null values (count of NaNs per column)
cm_df.isna().sum()
There are no null values in the dataset provided to us
cm_df.info() # basic info such as datatype, value types
# Getting the 5 point summary (transposed: one row per feature)
cm_df.describe().T
# Checking how many rows have at least one 0 value
# (a 0 in columns like slag/ash/superplastic is presumably "ingredient absent",
# not missing data — confirm before treating zeros as missing)
(~cm_df.all(1)).sum()
# To check for correlation
colormap = plt.cm.magma
plt.figure(figsize=(20,20))
sns.heatmap(cm_df.corr(), linewidths=0.1, vmax=1.0,
square=True, cmap=colormap, linecolor='white', annot=True)
cm_df.corr()
#cm_df.median()
#To impute missing values (kept for reference — not needed: no NaNs were found above)
#impute = SimpleImputer( missing_values = 0, strategy='median')
#impute = impute.fit(df[:,0:18])
#cm_df.iloc[:,0:9] = impute.fit_transform(cm_df.iloc[:,0:9])
# Doing a boxplot on the whole data to check for outliers across all columns at once
sns.boxplot( data=cm_df, orient= "h" )
# Paired plots for every predictor: distribution (left) and boxplot (right).
# The last column ('strength', the target) is skipped, matching the original
# `while i < len(cols) - 1` bound.
# FIX: the boxplot title labelled the standard deviation as "SE" (standard
# error); cm_df[col].std() is the sample standard deviation, so label it "SD".
plt.figure(figsize = (15,66))
cols = cm_df.columns.values
plot_no = 1  # running subplot index in the 9x2 grid
for col in cols[:-1]:
    # distribution plot
    plt.subplot(9, 2, plot_no)
    sns.distplot(cm_df[col])
    plot_no += 1
    plt.title(f'Distribution of {col}')
    plt.xlabel(f'{col}')
    plt.ylabel('Frequency')
    # boxplot with summary statistics in the title
    plt.subplot(9, 2, plot_no)
    sns.boxplot(cm_df[col], color='yellowgreen')
    plot_no += 1
    q1, q3 = np.percentile(cm_df[col], [25, 75])
    IQR = q3 - q1
    plt.title(f'Boxplot of {col} \n \u03bc = {round(cm_df[col].mean(), 3)}, SD = {round(cm_df[col].std(),4)}, Median = {round(cm_df[col].median(),3)}, IQR = {round(IQR, 3)} ')
    plt.xlabel(f'{col}')
    plt.ylabel('Frequency')
# Pairwise scatter plots with KDE on the diagonal — view before outlier treatment.
sns.pairplot(cm_df, diag_kind='kde')
from scipy import stats
import numpy as np
# Absolute z-score of every cell; |z| > 3 flags potential outliers.
z = np.abs(stats.zscore(cm_df))
print(z)
print(np.where(z>3)) # (row, column) indices of the flagged cells
# Winsorise the heavy-tailed columns: clip each at its own 1st / 99th percentile.
for col in ('slag', 'water', 'fineagg', 'superplastic'):
    low, high = cm_df[col].quantile([0.01, 0.99]).values
    cm_df[col] = np.clip(cm_df[col], low, high)
# Earlier experiments, kept for reference:
#percentiles = cm_df['superplastic'].quantile([0.01,0.99]).values
#cm_df['superplastic'] = np.clip(cm_df['superplastic'], percentiles[0], percentiles[1])
#percentiless = cm_df['fineagg'].quantile([0.023,0.977]).values
#percentiless
#cm_df['slag'] = np.clip(cm_df['slag'], percentiless[0], percentiless[1])
#cm_df = cm_df[(z < 3).all(axis=1)]
# Re-draw the distribution/boxplot pairs after the percentile clipping above,
# to confirm the tails were trimmed. Same layout as the pre-clipping version.
# FIX: the boxplot title labelled the standard deviation as "SE" (standard
# error); cm_df[col].std() is the sample standard deviation, so label it "SD".
plt.figure(figsize = (15,66))
cols = cm_df.columns.values
plot_no = 1  # running subplot index in the 9x2 grid
for col in cols[:-1]:  # skip the last column (the target, 'strength')
    # distribution plot
    plt.subplot(9, 2, plot_no)
    sns.distplot(cm_df[col])
    plot_no += 1
    plt.title(f'Distribution of {col}')
    plt.xlabel(f'{col}')
    plt.ylabel('Frequency')
    # boxplot with summary statistics in the title
    plt.subplot(9, 2, plot_no)
    sns.boxplot(cm_df[col], color='yellowgreen')
    plot_no += 1
    q1, q3 = np.percentile(cm_df[col], [25, 75])
    IQR = q3 - q1
    plt.title(f'Boxplot of {col} \n \u03bc = {round(cm_df[col].mean(), 3)}, SD = {round(cm_df[col].std(),4)}, Median = {round(cm_df[col].median(),3)}, IQR = {round(IQR, 3)} ')
    plt.xlabel(f'{col}')
    plt.ylabel('Frequency')
After clipping the heavy-tailed columns at their 1st and 99th percentiles, the extreme outliers have been removed.
cm_df.corr() # correlation table after outlier clipping
Added below new features after doing some domain research
# Water/cement ratio — a classic determinant of concrete strength.
cm_df['wc_ratio'] = cm_df['water'] / cm_df['cement']
# NOTE(review): labelled "water/binder", but binder conventionally also includes
# slag and ash; here it is water / (water + cement) — confirm this is intended.
cm_df['wb_ratio'] = cm_df['water'] / ( cm_df['water'] + cm_df['cement'] )
Not dropping columns at the moment
# Correlation heatmap again, now including the two engineered ratio features.
plt.figure(figsize=(20, 20))
sns.heatmap(
    cm_df.corr(),
    cmap=plt.cm.magma,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
    square=True,
    annot=True,
)
Thus we can see there is a very strong inverse correlation between the strength and the new features.
cm_df.corr() # exact correlations including the new ratio features
# Pairplot to visually inspect the engineered features' relationship with strength
sns.pairplot(cm_df, diag_kind='kde')
The pairplots justify the addition of new features
# 70/30 train-test split: predictors vs the continuous target 'strength'.
X = cm_df.drop(['strength'], axis = 1)
y = cm_df['strength']
X_train, X_test, y_train, y_test = train_test_split(X , y ,test_size = 0.30 , random_state = 1234)
scale = StandardScaler() # Standard scaling
scale.fit(X_train.iloc[:,:])# fitting on training data so the data integrity in test is maintained
X_train.iloc[:,:] = scale.transform(X_train.iloc[:,:])
X_test.iloc[:,:] = scale.transform(X_test.iloc[:,:])
X_train
# Ordinary least-squares baseline on the scaled features.
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
# Report each fitted coefficient next to its feature name.
for col, coef in zip(X_train.columns, reg_model.coef_):
    print("The coefficient for {} is {}".format(col, coef))
reg_model.coef_
intercept = reg_model.intercept_
print("The intercept for our model is {}".format(intercept))
# L2-regularised linear model (small penalty keeps it close to OLS)
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))
# L1-regularised linear model — can drive weak coefficients to exactly zero
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))
# Single decision tree, depth capped at 10 to limit overfitting
dt_model = DecisionTreeRegressor( max_depth=10)
dt_model.fit(X_train, y_train)
dt_model.score(X_test, y_test) # R^2 on test
dt_model.score(X_train, y_train) # R^2 on train (a large gap vs test indicates overfit)
# R^2 scores for the three linear models (train first, then test)
#Linear Regression
print(reg_model.score(X_train, y_train))
print(reg_model.score(X_test, y_test))
# Ridge Regression
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))
# Lasso Regression
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))
# Random forest baseline on the original (non-polynomial) features
rfTree = RandomForestRegressor(n_estimators=50)
rfTree.fit(X_train,y_train)
print("rfTree on train data ", rfTree.score(X_train,y_train))
print("rfTree on test data ", rfTree.score(X_test, y_test))
# NOTE(review): if re-enabled, these lines are buggy — drop(..., inplace=True)
# returns None, so the assignment would set X_train/X_test to None.
#X_train = X_train.drop(['coarseagg','wc_ratio'], axis = 1 , inplace= True)
#X_test = X_test.drop(['coarseagg','wc_ratio'], axis = 1 , inplace= True)
# Expand the features with degree-2 interaction terms.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2, interaction_only=True)
# Fit the expansion on the training data only, then apply the SAME fitted
# transformer to the test set.
# FIX: the original called poly.fit_transform(X_test), which re-fits the
# transformer on test data and breaks the train/test separation.
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)
X_train.shape
X_test.shape
#X_train, X_test, y_train, y_test = train_test_split(X_poly , y ,test_size = 0.30 , random_state = 1)
X_train
X_train
# Refit the linear family on the expanded polynomial feature set
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))
# R^2 scores on polynomial features (train first, then test)
#Linear Regression
print(reg_model.score(X_train, y_train))
print(reg_model.score(X_test, y_test))
# Ridge Regression
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))
# Lasso Regression
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))
# Tree models on the polynomial feature set
rfTree = RandomForestRegressor(n_estimators=50)
rfTree.fit(X_train,y_train)
print("rfTree on train data ", rfTree.score(X_train,y_train))
print("rfTree on test data ", rfTree.score(X_test, y_test))
dt_model = DecisionTreeRegressor( max_depth=5)
dt_model.fit(X_train, y_train)
dt_model.score(X_test, y_test) # R^2 on test
dt_model.score(X_train, y_train) # R^2 on train
X_train.shape # feature count after the polynomial expansion
cm_df_z = cm_df.apply(zscore) # standardised copy of the frame
# Elbow analysis: fit KMeans for k = 2..14 and record the inertia (within-cluster
# sum of squares) of each fit.
cluster_range = range(2, 15)
cluster_errors = []
for num_clusters in cluster_range:
    km_model = KMeans(num_clusters, n_init=10)
    km_model.fit(cm_df_z)
    cluster_errors.append(km_model.inertia_)
clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df # inspect the error for each candidate k
# Elbow plot to pick the optimal cluster count
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
Based on the elbow plot, choosing 5 or 4 clusters seems reasonable.
# Final clustering with k = 5 on the standardised data
kmeans = KMeans(n_clusters= 5)
kmeans.fit(cm_df_z)
labels = kmeans.labels_
counts = np.bincount(labels[labels>=0]) # size of each cluster
print(counts)
## creating a new dataframe only for labels and converting it into categorical variable
cluster_labels = pd.DataFrame(kmeans.labels_ , columns = list(['labels']))
cluster_labels['labels'] = cluster_labels['labels'].astype('category')
cm_df_labeled = cm_df.join(cluster_labels)
# Per-cluster boxplots of every (unscaled) feature
cm_df_labeled.boxplot(by = 'labels', layout=(6,2), figsize=(20, 60))
prediction=kmeans.predict(cm_df_z)
cm_df_z["GROUP"] = prediction # Creating a new column "GROUP" which will hold the cluster id of each record
cm_df_z_copy = cm_df_z.copy(deep = True) # Creating a mirror copy for later re-use instead of building repeatedly
centroids = kmeans.cluster_centers_
centroids
# Centroids as a table, one column per original feature
centroid_df = pd.DataFrame(centroids, columns = list(cm_df) )
centroid_df
cm_df_z.boxplot(by = 'GROUP', layout=(4,3), figsize=(15, 25))
# Regression of (standardised) strength on each predictor, coloured by the
# k=5 cluster assignment. One lmplot per variable, same order as before.
for var in ['cement', 'water', 'ash', 'slag', 'coarseagg',
            'fineagg', 'age', 'superplastic', 'wc_ratio', 'wb_ratio']:
    with sns.axes_style("white"):
        plot = sns.lmplot(var, 'strength', data=cm_df_z, hue='GROUP')
        plot.set(ylim=(-3, 3))
# Repeat the clustering with k = 4 (fixed seed for reproducibility)
cm_df_z2 = cm_df.apply(zscore)
kmeans = KMeans(n_clusters= 4, random_state= 1)
kmeans.fit(cm_df_z2)
labels = kmeans.labels_
counts = np.bincount(labels[labels>=0]) # size of each cluster
print(counts)
prediction=kmeans.predict(cm_df_z2)
cm_df_z2["GROUP"] = prediction # Creating a new column "GROUP" which will hold the cluster id of each record
cm_df_z2_copy = cm_df_z2.copy(deep = True) # Creating a mirror copy for later re-use instead of building repeatedly
centroids = kmeans.cluster_centers_
centroids
# Centroids as a table, one column per original feature
centroid_df = pd.DataFrame(centroids, columns = list(cm_df) )
centroid_df
cm_df_z2.boxplot(by = 'GROUP', layout=(4,3), figsize=(15, 25))
# Same per-variable regression plots as above, now coloured by the k=4 clusters.
for var in ['cement', 'water', 'ash', 'slag', 'coarseagg',
            'fineagg', 'age', 'superplastic', 'wc_ratio', 'wb_ratio']:
    with sns.axes_style("white"):
        plot = sns.lmplot(var, 'strength', data=cm_df_z2, hue='GROUP')
        plot.set(ylim=(-3, 3))
X = cm_df.drop(['strength'], axis = 1)
y = cm_df['strength']
scale2 = StandardScaler() # Standard scaling
scale2.fit(X.loc[:,:])# fitted on the full predictor set (no split here — PCA below is exploratory)
X_scaled = scale2.transform(X.loc[:,:])
# Covariance matrix of the scaled predictors — 10x10 (8 originals + 2 engineered ratios)
covMatrix = np.cov(X_scaled,rowvar=False)
print(covMatrix)
#Performing PCA on all of its 10 predictors
pca = PCA(n_components=10)
pca.fit(X_scaled)
# Scree plot: per-component and cumulative explained variance
plt.bar(range(1,11), pca.explained_variance_ratio_, alpha = 0.5, align='center', label='individual explained variance')
plt.step( range(1,11), np.cumsum(pca.explained_variance_ratio_), where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
plt.show()
# Component loadings: one row per principal component, one column per feature
df_comp = pd.DataFrame(pca.components_,columns=list(X.columns.values))
df_comp
plt.figure(figsize=(12,6))
sns.heatmap(df_comp,cmap='plasma')
X = cm_df.drop(['strength'], axis = 1)
y = cm_df['strength']
# 60/20/20 train/validation/test split (0.25 of the remaining 80% = 20%)
X_train, X_test, y_train, y_test = train_test_split(X , y ,test_size = 0.20 , random_state = 1)
X_train, X_val, y_train, y_val = train_test_split(X_train , y_train ,test_size = 0.25 , random_state = 1)
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
scale = StandardScaler() # Standard scaling
scale.fit(X_train.iloc[:,:])# fitting on training data so the data integrity in test is maintained
X_train.iloc[:,:] = scale.transform(X_train.iloc[:,:])
X_val.iloc[:,:] = scale.transform(X_val.iloc[:,:])
X_test.iloc[:,:] = scale.transform(X_test.iloc[:,:])
# Gradient boosting baseline on the three-way split, plus feature importances.
gbmTree = GradientBoostingRegressor(n_estimators=50)
gbmTree.fit(X_train,y_train)
print("gbmTree on training" , gbmTree.score(X_train, y_train))
# FIX: both of the next two scores were labelled "test data"; the first one
# actually uses the validation split.
print("gbmTree on validation data ", gbmTree.score(X_val, y_val))
print("gbmTree on test data ", gbmTree.score(X_test, y_test))
# View a list of the features and their importance scores
importances = gbmTree.feature_importances_
indices = np.argsort(importances)[::-1][:10]
# FIX: pd.Index.drop takes (labels, errors); the original's stray positional
# `1` was silently passed as the `errors` argument.
features = cm_df.columns.drop(['strength'])
#plot it
plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
# Random forest on the three-way split, plus feature importances.
rfTree = RandomForestRegressor(n_estimators=50)
rfTree.fit(X_train,y_train)
print("rfTree on train data ", rfTree.score(X_train,y_train))
# FIX: label said "test data" but this scores the validation split.
print("rfTree on validation data ", rfTree.score(X_val, y_val))
# FIX: copy-paste bug — this cell evaluates the random forest, but the
# original scored (and labelled) gbmTree on this line.
print("rfTree on test data ", rfTree.score(X_test, y_test))
# View a list of the features and their importance scores
importances = rfTree.feature_importances_
indices = np.argsort(importances)[::-1][:10]
# FIX: pd.Index.drop takes (labels, errors); drop the stray positional `1`.
features = cm_df.columns.drop(['strength'])
#plot it
plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
# Rebuild the splits after dropping the low-importance / redundant predictors
X = cm_df.drop(['strength','coarseagg','ash','cement','wc_ratio'], axis = 1)
y = cm_df['strength']
# 60/20/20 train/validation/test split
X_train, X_test, y_train, y_test = train_test_split(X , y ,test_size = 0.20 , random_state = 1)
X_train, X_val, y_train, y_val = train_test_split(X_train , y_train ,test_size = 0.25 , random_state = 1)
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
scale = StandardScaler() # Standard scaling
scale.fit(X_train.iloc[:,:])# fitting on training data so the data integrity in test is maintained
X_train.iloc[:,:] = scale.transform(X_train.iloc[:,:])
X_val.iloc[:,:] = scale.transform(X_val.iloc[:,:])
X_test.iloc[:,:] = scale.transform(X_test.iloc[:,:])
# Fresh untuned estimators for the hyper-parameter searches below
svreg = SVR()
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()
br = BaggingRegressor()
# Checking the hyperparameters present in the models for clarity
for clf, label in zip([svreg , rfr, gbr, br], ['svreg','rfr','gbr','br']):
    print("model name: " , label)
    print("\n model_hyperparameters \n" , clf.get_params() )
# Exhaustive grid search for the SVR (10-fold CV)
svreg_gs_param_grid = {
    'C' : [0.01, 0.1 , 1, 10,20, 30 , 50 , 100,200,400,500,1000],
    'gamma' : ['auto','scale'],
    'kernel' : ['poly','rbf']
}
# run grid search
grid_search = GridSearchCV(estimator=svreg, param_grid=svreg_gs_param_grid, cv=10)
#start = time()
grid_search.fit(X_train, y_train)
#Getting best parameters
grid_search.best_params_
print(grid_search.cv_results_['mean_test_score']) # mean CV R^2 per combination
grid_search.best_estimator_
# Refit the SVR with the parameters chosen from the grid search above
svregressor = SVR( C=200, gamma='auto')
svregressor.fit(X_train, y_train)
print("SVR on train data ", svregressor.score(X_train,y_train))
print("SVR on validation data ", svregressor.score(X_val,y_val))
print("SVR on test data ", svregressor.score(X_test,y_test))
# Grid search for the random forest (10-fold CV)
rfr_gs_param_grid = {
    'n_estimators' : [10, 50, 100, 200],
    'max_depth': range(5,10),
    'criterion': ['mse','mae'],
    'min_samples_leaf' : range(1,4),
    'max_features':['auto','sqrt']
}
# run grid search
grid_search = GridSearchCV(estimator=rfr, param_grid=rfr_gs_param_grid, cv=10)
#start = time()
grid_search.fit(X_train, y_train)
# Getting the best parameters
grid_search.best_params_
# Refit the random forest with the chosen parameters and score all splits.
# NOTE(review): max_depth=11 lies outside the searched range(5,10) — confirm
# these values actually came from the grid-search results.
rfTree = RandomForestRegressor(n_estimators=50, max_depth=11, max_features='auto',min_samples_leaf=1, criterion='mse')
rfTree.fit(X_train,y_train)
print("Random Forrest on train data ", rfTree.score(X_train,y_train))
print("Random Forrest on validation data ", rfTree.score(X_val,y_val))
# FIX: label said "validation data" but this line scores the test split.
print("Random Forrest on test data ", rfTree.score(X_test,y_test))
# Grid search for gradient boosting (10-fold CV over a large grid — slow)
gbr_gs_param_grid = {
    'n_estimators' : [50, 100],
    'max_depth': range(5,10),
    'criterion': ['mse','mae'],
    'min_samples_leaf' : range(1,4),
    'max_features':['auto','sqrt'],
    'learning_rate' : [0.001, 0.01, 0.05,0.1]
}
# run grid search
grid_search = GridSearchCV(estimator=gbr, param_grid=gbr_gs_param_grid, cv=10)
#start = time()
grid_search.fit(X_train, y_train)
# Getting the best parameters
grid_search.best_params_
# Refit with the selected parameters; score train / validation / test
gbmTree = GradientBoostingRegressor(n_estimators=100, criterion='mse', learning_rate=0.1, max_depth=6, max_features='sqrt',
                                    min_samples_leaf=3)
gbmTree.fit(X_train,y_train)
print("Gradient Boost on training data" , gbmTree.score(X_train, y_train))
print("Gradient Boost on validation data ",gbmTree.score(X_val,y_val))
print("Gradient Boost on test data ",gbmTree.score(X_test,y_test))
# Grid search for the bagging regressor (10-fold CV)
br_gs_param_grid = {
    'n_estimators' : [50, 100, 200, 400, 500, 1000],
    'max_features':range(1,6),
    'bootstrap':[True, False]
}
# run grid search
grid_search = GridSearchCV(estimator=br, param_grid=br_gs_param_grid, cv=10)
#start = time()
grid_search.fit(X_train, y_train)
# Getting the best parameters
grid_search.best_params_
print(grid_search.cv_results_['mean_test_score']) # mean CV R^2 per combination
# Refit with the selected parameters; score train / validation / test
bgcl = BaggingRegressor(n_estimators=500, bootstrap= True, max_features=5)
bgcl = bgcl.fit(X_train,y_train)
print("bgcl on train data ", bgcl.score(X_train,y_train))
print("bgcl on validation data ", bgcl.score(X_val,y_val))
print("bgcl on test data ", bgcl.score(X_test,y_test))
#print("out of bag score" , bgcl.oob_score_)
# Re-instantiate fresh estimators, then randomized search for the bagging
# regressor (60 random draws from the grid).
svreg = SVR()
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()
br = BaggingRegressor()
br_rs_param_grid = {
    'n_estimators' : [50, 100, 200, 400, 500, 1000],
    'max_features':range(1,6),
    'bootstrap':[True, False]
}
# run randomized search
samples = 60 # number of random samples
randomCV = RandomizedSearchCV(br, param_distributions=br_rs_param_grid, n_iter=samples) #default cv = 3
randomCV.fit(X_train, y_train)
print(randomCV.best_params_)
# Refit with the selected parameters; score all three splits
bgcl = BaggingRegressor(n_estimators=500, bootstrap= True, max_features=5)
bgcl = bgcl.fit(X_train,y_train)
print("bgcl on train data ", bgcl.score(X_train,y_train))
# FIX: label said "test data" but this line scores the validation split.
print("bgcl on validation data ", bgcl.score(X_val,y_val))
print("bgcl on test data ", bgcl.score(X_test,y_test))
#print("out of bag score" , bgcl.oob_score_)
# Randomized search for gradient boosting (1500 draws — large; can be slow)
gbr_rs_param_grid = {
    'n_estimators' : [50, 100, 200, 400, 500, 1000],
    'max_depth': range(4,7),
    'criterion': ['mse','mae'],
    'min_samples_leaf' : sp_randint(1, 8),
    'max_features':sp_randint(1,6),
    'loss' : ['ls', 'lad', 'huber', 'quantile'],
    'learning_rate' : [0.001, 0.01, 0.05,0.1, 0.2,0.3]
}
# run randomized search
samples = 1500 # number of random samples
randomCV = RandomizedSearchCV(gbr, param_distributions=gbr_rs_param_grid, n_iter=samples) #default cv = 3
randomCV.fit(X_train, y_train)
print(randomCV.best_params_)
# Refit with the selected parameters; score train / validation / test
gbmTree = GradientBoostingRegressor( criterion='mse', learning_rate = 0.05, loss= 'huber', max_depth= 5,
                                    max_features= 2, min_samples_leaf= 7, n_estimators= 1000)
gbmTree.fit(X_train,y_train)
print("gbmTree on training" , gbmTree.score(X_train, y_train))
print("gbmTree on validation data ",gbmTree.score(X_val,y_val))
print("gbmTree on test data ",gbmTree.score(X_test,y_test))
# Randomized search for the random forest (1000 draws)
rfr_rs_param_grid = {
    'n_estimators' : [50, 100, 200, 400, 500, 1000],
    'max_depth': range(5,10),
    'criterion': ['mse','mae'],
    'min_samples_leaf' : sp_randint(1, 5),
    'max_features':sp_randint(1, 6)
}
# run randomized search
samples = 1000 # number of random samples
randomCV = RandomizedSearchCV(rfr, param_distributions=rfr_rs_param_grid, n_iter=samples) #default cv = 3
randomCV.fit(X_train, y_train)
print(randomCV.best_params_)
# Refit with the selected parameters; score all three splits
rfTree = RandomForestRegressor(n_estimators=500, max_depth=9, max_features=5 ,min_samples_leaf=1, criterion='mse')
rfTree.fit(X_train,y_train)
print("rfTree on train data ", rfTree.score(X_train,y_train))
print("rfTree on validation data ", rfTree.score(X_val,y_val))
print("rfTree on test data ", rfTree.score(X_test,y_test))
# Randomized search for the SVR (99 draws)
svreg_rs_param_grid = {
    'C' : [0.01, 0.1 , 1, 10,20, 30 , 50 , 100, 200, 300, 500],
    'gamma' : [0.01, 0.1,0.05,0.5,1,10,20, 'auto', 'scale']
}
# run randomized search
samples = 99 # number of random samples
randomCV = RandomizedSearchCV(svreg, param_distributions=svreg_rs_param_grid, n_iter=samples) #default cv = 3
randomCV.fit(X_train, y_train)
print(randomCV.best_params_)
# Refit with the selected parameters; score all three splits
svregressor = SVR( C=500, gamma=0.1)
svregressor.fit(X_train, y_train)
print("SVR on train data ", svregressor.score(X_train,y_train))
print("SVR on validation data ", svregressor.score(X_val,y_val))
print("SVR on test data ", svregressor.score(X_test,y_test))
y_test # eyeballing the target values of the test split
# View a list of the features and their importance scores for the final
# (tuned, reduced-feature) gradient boosting model.
importances = gbmTree.feature_importances_
indices = np.argsort(importances)[::-1][:6]
# FIX: pd.Index.drop takes (labels, errors); the original's stray positional
# `1` was silently passed as the `errors` argument.
features = cm_df.columns.drop(['strength','coarseagg','ash','cement','wc_ratio'])
#plot it
plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
# Build a single frame with the kept predictors scaled and 'strength' moved to
# the last column (the bootstrap below slices features as [:, :-1]).
Xdf = cm_df.drop(['coarseagg','ash','cement','wc_ratio'], axis = 1)
strength = Xdf['strength']
Xdf.drop(labels=['strength'], axis=1, inplace=True)
Xdf.insert(6,'strength',strength ) # target goes in the final position
scale = StandardScaler() # Standard scaling
scale.fit(Xdf.iloc[:,:-1])# fitted on ALL rows here (no hold-out); only predictor columns are scaled
Xdf.iloc[:,:-1] = scale.transform(Xdf.iloc[:,:-1])
Xdf.head()
val = Xdf.values # numpy array for the bootstrap resampling below
len(Xdf)
val
# configure bootstrap
n_iterations = 1000 # Number of bootstrap samples to create
n_size = int(len(Xdf) * 0.70) # each bootstrap sample draws 70% of the rows (with replacement)
# run bootstrap
stats = list() # NOTE(review): shadows the `stats` imported from scipy earlier in the file
for i in range(n_iterations):
    # prepare train and test sets
    train = resample(val, n_samples=n_size) # Sampling with replacement
    # out-of-bag rows: everything not drawn into this sample
    # (row-wise membership test is O(n^2) and compares float lists — slow but works)
    test = np.array([x for x in val if x.tolist() not in train.tolist()])
    # fit model (same hyper-parameters as the tuned gradient boosting model)
    model = GradientBoostingRegressor( criterion='mse', learning_rate = 0.05, loss= 'huber', max_depth= 5,
                                      max_features= 2, min_samples_leaf= 7, n_estimators= 1000)
    model.fit(train[:,:-1], train[:,-1])
    # evaluate model
    #predictions = model.predict(test[:,:-1])
    score = model.score(test[:,:-1], test[:,-1]) # R^2 on the out-of-bag rows (regression — not classification accuracy)
    print(score)
    stats.append(score)
# plot scores
plt.hist(stats)
plt.show()
# confidence intervals from the empirical bootstrap distribution
alpha = 0.95 # for 95% confidence
p = ((1.0-alpha)/2.0) * 100 # lower-tail percentile (2.5)
lower = max(0.0, np.percentile(stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100 # upper-tail percentile (97.5)
upper = min(1.0, np.percentile(stats, p))
print('%.3f confidence interval %.3f%% and %.3f%%' % (alpha*100, lower*100, upper*100))
# 10-fold cross-validation of the tuned gradient boosting model on the
# reduced feature set.
X = cm_df.drop(['strength','coarseagg','ash','cement','wc_ratio'], axis = 1)
y = cm_df['strength']
# FIX: the original evaluated X.apply(zscore) and discarded the result, so X
# was never standardised; assign it back. (Tree ensembles are largely
# scale-insensitive, but the intent was clearly to standardise.)
X = X.apply(zscore)
kfold = KFold(n_splits=10)
model = GradientBoostingRegressor( criterion='mse', learning_rate = 0.05, loss= 'huber', max_depth= 5,
                                  max_features= 2, min_samples_leaf= 7, n_estimators= 1000)
results = cross_val_score(model, X, y, cv=kfold) # one R^2 per fold
print(results)
# FIX: cross_val_score on a regressor returns R^2, not accuracy — label it so.
print("R^2: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# To find interval at 95% confidence (normal approximation: mean +/- 1.96*sd)
std95 = results.std() * 1.96
min_interval = results.mean() - std95
max_interval = results.mean() + std95
print('We can inform the stakeholders that the model can be said to perform between {min:.3f}% & {max:.3f}% at 95% confidence'.
format(min = min_interval*100.0, max = max_interval*100.0))